import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
# Load the gas-turbine sensor dataset (2011-2015) into a pandas DataFrame.
df = pd.read_parquet('gt2011-gt2015.parquet')
# CO and NOX are the prediction targets; all other columns may be used as features.
df
| AT | AP | AH | AFDP | GTEP | TIT | TAT | TEY | CDP | CO | NOX | Datetime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4.5878 | 1018.7 | 83.675 | 3.5758 | 23.979 | 1086.2 | 549.83 | 134.67 | 11.898 | 0.32663 | 81.952 | 2011-01-01 00:00:00 |
| 1 | 4.2932 | 1018.3 | 84.235 | 3.5709 | 23.951 | 1086.1 | 550.05 | 134.67 | 11.892 | 0.44784 | 82.377 | 2011-01-01 01:10:55 |
| 2 | 3.9045 | 1018.4 | 84.858 | 3.5828 | 23.990 | 1086.5 | 550.19 | 135.10 | 12.042 | 0.45144 | 83.776 | 2011-01-01 02:21:50 |
| 3 | 3.7436 | 1018.3 | 85.434 | 3.5808 | 23.911 | 1086.5 | 550.17 | 135.03 | 11.990 | 0.23107 | 82.505 | 2011-01-01 03:32:45 |
| 4 | 3.7516 | 1017.8 | 85.182 | 3.5781 | 23.917 | 1085.9 | 550.00 | 134.67 | 11.910 | 0.26747 | 82.028 | 2011-01-01 04:43:40 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 36728 | 3.6268 | 1028.5 | 93.200 | 3.1661 | 19.087 | 1037.0 | 541.59 | 109.08 | 10.411 | 10.99300 | 89.172 | 2015-12-31 18:21:49 |
| 36729 | 4.1674 | 1028.6 | 94.036 | 3.1923 | 19.016 | 1037.6 | 542.28 | 108.79 | 10.344 | 11.14400 | 88.849 | 2015-12-31 19:33:00 |
| 36730 | 5.4820 | 1028.5 | 95.219 | 3.3128 | 18.857 | 1038.0 | 543.48 | 107.81 | 10.462 | 11.41400 | 96.147 | 2015-12-31 20:44:11 |
| 36731 | 5.8837 | 1028.7 | 94.200 | 3.9831 | 23.563 | 1076.9 | 550.11 | 131.41 | 11.771 | 3.31340 | 64.738 | 2015-12-31 21:55:22 |
| 36732 | 6.0392 | 1028.8 | 94.547 | 3.8752 | 22.524 | 1067.9 | 548.23 | 125.41 | 11.462 | 11.98100 | 109.240 | 2015-12-31 23:06:33 |
36733 rows × 12 columns
# Summary statistics (count / mean / std / quantiles) for every numeric column.
df.describe()
| AT | AP | AH | AFDP | GTEP | TIT | TAT | TEY | CDP | CO | NOX | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 36733.000000 | 36733.000000 | 36733.000000 | 36733.000000 | 36733.000000 | 36733.000000 | 36733.000000 | 36733.000000 | 36733.000000 | 36733.000000 | 36733.000000 |
| mean | 17.712726 | 1013.070165 | 77.867015 | 3.925518 | 25.563801 | 1081.428084 | 546.158517 | 133.506404 | 12.060525 | 2.372468 | 65.293067 |
| std | 7.447451 | 6.463346 | 14.461355 | 0.773936 | 4.195957 | 17.536373 | 6.842360 | 15.618634 | 1.088795 | 2.262672 | 11.678357 |
| min | -6.234800 | 985.850000 | 24.085000 | 2.087400 | 17.698000 | 1000.800000 | 511.040000 | 100.020000 | 9.851800 | 0.000388 | 25.905000 |
| 25% | 11.781000 | 1008.800000 | 68.188000 | 3.355600 | 23.129000 | 1071.800000 | 544.720000 | 124.450000 | 11.435000 | 1.182400 | 57.162000 |
| 50% | 17.801000 | 1012.600000 | 80.470000 | 3.937700 | 25.104000 | 1085.900000 | 549.880000 | 133.730000 | 11.965000 | 1.713500 | 63.849000 |
| 75% | 23.665000 | 1017.000000 | 89.376000 | 4.376900 | 29.061000 | 1097.000000 | 550.040000 | 144.080000 | 12.855000 | 2.842900 | 71.548000 |
| max | 37.103000 | 1036.600000 | 100.200000 | 7.610600 | 40.716000 | 1100.900000 | 550.610000 | 179.500000 | 15.159000 | 44.103000 | 119.910000 |
px.histogram(df, x='NOX', nbins=50)
# Distribution of NOX over all years
px.histogram(df, x='CO', nbins=50)
# Distribution of CO over all years
# Correlations between all input features; several correlate strongly with each other.
plt.figure(figsize=(10, 8))
# Compute the feature-feature correlation matrix once instead of twice.
feature_corr = df.drop(columns=['CO', 'NOX']).corr()
# Upper-triangle mask. Fixed: the mask was computed but never passed to
# sns.heatmap, so the plot previously showed the full (redundant) matrix.
mask = np.triu(np.ones_like(feature_corr, dtype=bool))
heatmap = sns.heatmap(feature_corr, mask=mask, vmin=-1, vmax=1, annot=True)
heatmap.set_title('Correlation matrix of input variables', fontdict={'fontsize':18}, pad=16);
# Correlation of each input column with the two targets (CO and NOX).
# Vectorized with DataFrame.corrwith instead of a manual append loop.
# NOTE: df.columns[:-2] drops only NOX and Datetime, so CO itself is still
# included (its CO-correlation is trivially 1.0, as the output table shows).
feature_cols = df.columns[:-2]
corr_df = pd.DataFrame({
    'CO': df[feature_cols].corrwith(df['CO']),
    'NOX': df[feature_cols].corrwith(df['NOX']),
})
cm = sns.light_palette("green", as_cmap=True)  # NOTE(review): unused below — "PiYG" is passed instead; confirm which palette is intended
display(((corr_df).round(3)).style.background_gradient(cmap="PiYG",axis=None).format("{:3}"))
| CO | NOX | |
|---|---|---|
| AT | -0.174 | -0.558 |
| AP | 0.067 | 0.192 |
| AH | 0.107 | 0.165 |
| AFDP | -0.448 | -0.188 |
| GTEP | -0.519 | -0.202 |
| TIT | -0.706 | -0.214 |
| TAT | 0.058 | -0.093 |
| TEY | -0.57 | -0.116 |
| CDP | -0.551 | -0.171 |
| CO | 1.0 | 0.341 |
Distributions of selected features over the entire time period
# Time series of selected sensor readings over the full 2011-2015 span.
cols = ['GTEP','TIT','TEY','CDP', 'AT']
for sensor in cols:
    # One interactive scatter plot per sensor, indexed by timestamp.
    px.scatter(df, x='Datetime', y=sensor).show()
# Chronological train/test split for the CO task.
train_data = df[df['Datetime'].dt.year.isin([2011, 2012, 2013])] # 2011-2013 are the training data
cols = ['GTEP','TIT','TEY','CDP'] # columns with the strongest correlations with the target variable
X_train, y_train = train_data[cols], train_data['CO']
test_data = df[df['Datetime'].dt.year.isin([2014, 2015])] # 2014 and 2015 are the test data
X_test, y_test = test_data[cols], test_data['CO']
# Defining MAPE function
def mape_func(Y_actual, Y_Predicted):
    """Mean absolute percentage error, in percent.

    Generalized to accept any array-likes (lists, Series, ndarrays) by
    converting to float arrays first.  Note: the result is inf/undefined
    when Y_actual contains zeros; the CO/NOX readings here are positive.
    """
    Y_actual = np.asarray(Y_actual, dtype=float)
    Y_Predicted = np.asarray(Y_Predicted, dtype=float)
    return np.mean(np.abs((Y_actual - Y_Predicted) / Y_actual)) * 100
def train_evaluate_model(model, X_train, y_train, X_test, y_test, scaler=None, params=None):
    """Instantiate `model` (optionally with `params`), optionally standardize
    the features with `scaler`, fit on the training split and score on the
    test split.

    Returns a tuple of (fitted_model, rmse, mape, mae, predictions).
    """
    mdl = model(**params) if params is not None else model()
    if scaler is not None:
        # Fit the scaler on the training data only, then apply to both splits.
        fitted_scaler = scaler()
        fitted_scaler.fit(X_train)
        X_train = fitted_scaler.transform(X_train)
        X_test = fitted_scaler.transform(X_test)
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)
    return (
        mdl,
        mean_squared_error(y_true=y_test, y_pred=preds, squared=False),
        mape_func(y_test, preds),
        mean_absolute_error(y_true=y_test, y_pred=preds),
        preds,
    )
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
# Fit an XGBoost regressor for CO on the scaled features.
# Fixed: eval_set / verbose / early_stopping_rounds were previously passed to
# the constructor, where xgboost 1.5 ignores them (they are fit() arguments) —
# the warning this cell emitted confirms it.  Dropping them leaves the trained
# model unchanged and silences the warning.
xgb_regr, rmse, mape, mae, preds = train_evaluate_model(
    XGBRegressor, X_train, y_train, X_test, y_test,
    scaler=StandardScaler,
    params=dict(
        # NOTE(review): learning_rate and eta are aliases in xgboost —
        # setting both (1 and 0.3) is ambiguous; confirm which is intended.
        learning_rate = 1,
        max_depth=8,
        n_estimators=1000,
        min_child_weight=300,
        colsample_bytree=0.8,
        subsample=0.8,
        eta=0.3,
        seed=42,
        eval_metric="rmse",
    ))
[19:16:15] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:576:
Parameters: { "early_stopping_rounds", "eval_set", "verbose" } might not be used.
This could be a false alarm, with some parameters getting used by language bindings but
then being mistakenly passed down to XGBoost core, or some parameter actually being used
but getting flagged wrongly here. Please open an issue if you find any such cases.
# Inspect the test-set target (CO readings for 2014-2015).
y_test
22191 1.9157
22192 2.0596
22193 2.1621
22194 2.1214
22195 2.1549
...
36728 10.9930
36729 11.1440
36730 11.4140
36731 3.3134
36732 11.9810
Name: CO, Length: 14542, dtype: float64
# Report test-set error metrics for the CO XGBoost model.
print(f'{rmse = }, {mape = }, {mae = }')
print('r2_score:', r2_score(y_test, preds))
rmse = 2.86669409499469, mape = 141.33000678083764, mae = 1.7464813855778245 r2_score: -0.7144636263378334
# Quick scatter of predicted vs. measured CO values.
plt.scatter(x = preds, y = y_test)
<matplotlib.collections.PathCollection at 0x2b25296aa00>
# Confirm the test split covers the last rows (22191-36732) of the frame.
y_test.index
Int64Index([22191, 22192, 22193, 22194, 22195, 22196, 22197, 22198, 22199,
22200,
...
36723, 36724, 36725, 36726, 36727, 36728, 36729, 36730, 36731,
36732],
dtype='int64', length=14542)
# Interactive scatter with an OLS trendline (plotly).
px.scatter(x = y_test,y = preds,trendline = 'ols')
# Scatter plots of test set predictions
plt.figure(figsize=(14, 12))
ax = plt.axes()
plt.scatter(preds, y_test)
# Red line: least-squares fit of measured vs. predicted CO.
plt.plot(np.unique(preds), np.poly1d(np.polyfit(preds,y_test ,1))(np.unique(preds)), color='red')
plt.text(30, 30, 'R-squared = %0.2f' % r2_score(y_test, preds))
plt.title('Scatter plots of test set predictions', fontsize = 20, pad = 30)
plt.xlabel('XGBRegressor Prediction - CO', fontsize=16)
plt.ylabel('Sensor Measurement - CO', fontsize=18)
plt.show()
# gradient boosting for regression in scikit-learn
# (sanity-check example on synthetic data, independent of the turbine dataset)
# NOTE(review): `from numpy import mean` binds a module-level name `mean`
# that is easy to shadow with a local variable later on — watch for that.
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from matplotlib import pyplot
# define dataset
X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, random_state=1)
# evaluate the model with repeated 10-fold cross-validation
model = GradientBoostingRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
# fit the model on the whole dataset
model = GradientBoostingRegressor()
model.fit(X, y)
# make a single prediction (one 10-feature row matching make_regression's shape)
row = [[2.02220122, 0.31563495, 0.82797464, -0.30620401, 0.16003707, -1.44411381, 0.87616892, -0.50446586, 0.23009474, 0.76201118]]
yhat = model.predict(row)
print('Prediction: %.3f' % yhat[0])
MAE: -11.870 (1.142) Prediction: -80.661
# Cross-validate a default GradientBoostingRegressor on the CO *test* split only.
model = GradientBoostingRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X_test, y_test, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
n_scores
array([-0.66278579, -0.65700484, -0.62180465, -0.708055 , -0.59317205,
-0.64201897, -0.64468802, -0.65095366, -0.62433063, -0.67769352,
-0.60364151, -0.67313971, -0.65073171, -0.65470979, -0.67608425,
-0.63382767, -0.63865088, -0.66384182, -0.63741769, -0.65569141,
-0.65453857, -0.66577694, -0.69790018, -0.64927991, -0.5862866 ,
-0.63115048, -0.62088021, -0.64868003, -0.63959681, -0.67975392])
# Mean/std of the (negative) MAE fold scores from the cell above.
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
MAE: -0.648 (0.027)
# Cross-validated R^2 of a default GB regressor on the CO training split.
model = GradientBoostingRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X_train, y_train, scoring='r2', cv=cv, n_jobs=-1, error_score='raise')
n_scores
array([0.66429627, 0.63288592, 0.77351219, 0.59853198, 0.78952452,
0.69098979, 0.73189348, 0.67867599, 0.82022961, 0.61807817,
0.74949713, 0.63913106, 0.7165963 , 0.7615254 , 0.61326989,
0.73875143, 0.68315063, 0.55928474, 0.74601153, 0.73267799,
0.75864194, 0.56324606, 0.51075117, 0.74452793, 0.82855495,
0.74089043, 0.76947559, 0.61139742, 0.71161286, 0.6763386 ])
# Mean/std of the fold R^2 scores.
print('R^2: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
R^2: 0.695 (0.079)
from sklearn.model_selection import cross_validate
# cross_validate scores several metrics in a single pass (MAE and R^2 here).
model = GradientBoostingRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_validate(model, X_train, y_train, scoring=['neg_mean_absolute_error','r2'], cv=cv, n_jobs=-1, error_score='raise')
n_scores
{'fit_time': array([2.73044634, 2.82432461, 3.01813054, 2.77291369, 2.78607941,
2.84139872, 3.03807402, 2.86033583, 4.00799179, 3.92121863,
4.00518537, 4.18254185, 3.95114112, 4.37224531, 4.04361463,
3.91802716, 3.82704616, 3.62835598, 3.94688249, 3.75279427,
4.1317327 , 4.03492212, 3.67302108, 3.99550772, 3.40351343,
3.36476803, 3.3203733 , 3.40876389, 3.37674093, 3.22577477]),
'score_time': array([0.00499916, 0.01163411, 0.01108456, 0.00676107, 0.00600982,
0.00551176, 0.00623584, 0.00508356, 0.01878142, 0.00699854,
0.00773144, 0.00850773, 0.00774884, 0.00900054, 0.00801635,
0.00800228, 0.01019812, 0.01065731, 0.00904608, 0.00599933,
0.00699949, 0.01050878, 0.01204658, 0.00733185, 0.00750923,
0.00700188, 0.00561929, 0.00499511, 0.00600314, 0.00559402]),
'test_neg_mean_absolute_error': array([-0.66322213, -0.65700484, -0.62094827, -0.70849204, -0.59317205,
-0.64234961, -0.6443987 , -0.65059422, -0.62482857, -0.67719991,
-0.60424009, -0.67294521, -0.64827387, -0.655082 , -0.67663542,
-0.63294454, -0.63718027, -0.66389262, -0.63793847, -0.6541169 ,
-0.65415145, -0.66708513, -0.69712318, -0.64882283, -0.58590927,
-0.63124477, -0.62047674, -0.64868003, -0.63959681, -0.67866662]),
'test_r2': array([0.66155041, 0.63288592, 0.77168545, 0.60386681, 0.78932259,
0.68882752, 0.73072393, 0.67852338, 0.820106 , 0.61642389,
0.7437282 , 0.64026538, 0.72824009, 0.75952094, 0.6227231 ,
0.7388297 , 0.6803026 , 0.55547744, 0.74235652, 0.74290451,
0.7579204 , 0.56039961, 0.51458678, 0.74390926, 0.82855495,
0.74075961, 0.76971643, 0.6130633 , 0.71161286, 0.67347914])}
# Same multi-metric cross-validation on the CO test split, adding MAPE.
model = GradientBoostingRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_validate(model, X_test, y_test, scoring=['r2','neg_mean_absolute_error','neg_mean_absolute_percentage_error'], cv=cv, n_jobs=-1, error_score='raise')
n_scores
{'fit_time': array([3.12302947, 2.91563535, 3.32985783, 3.30308652, 2.7363615 ,
2.69050598, 2.90258813, 3.04057026, 3.6119988 , 3.20310187,
3.2887876 , 3.72162986, 3.14731646, 3.84771395, 3.74306393,
3.19097328, 3.16698265, 3.09482098, 3.34186053, 3.20370293,
3.14208603, 3.90983391, 3.87790513, 3.96253705, 3.20800543,
3.31690502, 3.21393013, 3.48509407, 3.12201166, 2.91965747]),
'score_time': array([0.00900006, 0.0156188 , 0.00700426, 0.01029038, 0.01300311,
0.00800276, 0.00900292, 0.00900078, 0.01281261, 0.0100019 ,
0.01276016, 0.0100677 , 0.00854683, 0.0090034 , 0.00989032,
0.03270364, 0.00892544, 0.00800061, 0.00845647, 0.00901246,
0.00799918, 0.0148232 , 0.00699759, 0.01507306, 0.00899076,
0.00781083, 0.00678372, 0.00646639, 0.00700068, 0.00740695]),
'test_r2': array([0.6155996 , 0.69794799, 0.821653 , 0.54329966, 0.84102294,
0.77441783, 0.79078996, 0.60461258, 0.6821228 , 0.6779364 ,
0.69166585, 0.69526843, 0.81623021, 0.50392442, 0.68982796,
0.68823315, 0.84073881, 0.78032614, 0.68349058, 0.73097318,
0.65362426, 0.79613564, 0.73484813, 0.71533087, 0.55999626,
0.64897811, 0.75629497, 0.71634413, 0.7038063 , 0.73689006]),
'test_neg_mean_absolute_error': array([-0.61633142, -0.57006377, -0.58436439, -0.59730161, -0.55260695,
-0.58179754, -0.56444574, -0.62105523, -0.64267484, -0.61543037,
-0.57751233, -0.59132387, -0.55877011, -0.69042467, -0.57610401,
-0.61773404, -0.556246 , -0.60700168, -0.58855775, -0.61316393,
-0.6160338 , -0.59334755, -0.60515288, -0.58196936, -0.64198267,
-0.61830552, -0.56203311, -0.60840549, -0.58642855, -0.56229006]),
'test_neg_mean_absolute_percentage_error': array([-0.96961661, -0.87338005, -0.3703086 , -2.00589047, -0.47575136,
-0.39012822, -0.34230467, -0.56051626, -1.06933129, -0.49614409,
-0.67539913, -0.6629671 , -0.49853711, -0.99669609, -0.50356315,
-0.67575365, -1.03852983, -0.44302825, -0.59107302, -1.39112329,
-0.66863512, -0.41658177, -1.81982309, -0.3670135 , -0.44929939,
-0.53186817, -0.60170586, -0.75717053, -0.58459262, -1.16845293])}
from sklearn.model_selection import cross_val_predict
# Out-of-fold predictions on the test split (each point is predicted by a
# model that did not see it during fitting).
preds = cross_val_predict(model, X_test, y_test, cv=10)
# Scatter plots of test set predictions
# NOTE(review): the x-axis label says XGBRegressor, but `model` here is a
# GradientBoostingRegressor — confirm the intended label.
plt.figure(figsize=(14, 12))
ax = plt.axes()
plt.scatter(preds, y_test)
plt.plot(np.unique(preds), np.poly1d(np.polyfit(preds,y_test ,1))(np.unique(preds)), color='red')
plt.title('Scatter plots of test set predictions', fontsize = 20, pad = 30)
plt.xlabel('XGBRegressor Prediction - CO', fontsize=16)
plt.ylabel('Sensor Measurement - CO', fontsize=18)
plt.show()
# Refit on the synthetic make_regression data (X, y) from the example above.
model = GradientBoostingRegressor()
model.fit(X, y)
GradientBoostingRegressor()
# Inspect one raw row of the turbine DataFrame (values + timestamp).
df.values[500]
array([6.8968, 1008.2, 93.336, 3.7909, 23.888, 1083.3, 548.85, 135.13,
11.865, 1.361, 78.764, Timestamp('2011-01-25 14:58:20')],
dtype=object)
# NOTE(review): this row has 10 entries to match the 10-feature synthetic
# model fitted above, but the values look styled after turbine sensor
# readings — the resulting prediction is not meaningful for the turbine task.
row = [[0, 0, 0, 0, 0, 1200, 100, 0,
0, 1.361]]
yhat = model.predict(row)
print('Prediction: %.3f' % yhat[0])
Prediction: 204.591
# Inspect the training target values.
# Fixed: the target is bound as lowercase `y_train`; `Y_train` is never
# defined in this file and raises NameError when run top-to-bottom.
y_train.values
array([0.32663, 0.44784, 0.45144, ..., 1.0472 , 1.0875 , 1.1337 ])
from sklearn.model_selection import cross_val_score
from sklearn import svm
# Fixed: CO is a continuous target, so use the support-vector *regressor*.
# svm.SVC is a classifier and raises ValueError("Unknown label type:
# 'continuous'"), as the traceback below shows.
clf = svm.SVR(kernel='linear', C=1).fit(X_train.values, y_train.values)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-165-62ab5ec2e887> in <module> 1 from sklearn.model_selection import cross_val_score 2 from sklearn import svm ----> 3 clf = svm.SVC(kernel='linear', C=1).fit(X_train.values, y_train.values) 4 ~\anaconda3\lib\site-packages\sklearn\svm\_base.py in fit(self, X, y, sample_weight) 197 ) 198 --> 199 y = self._validate_targets(y) 200 201 sample_weight = np.asarray( ~\anaconda3\lib\site-packages\sklearn\svm\_base.py in _validate_targets(self, y) 714 def _validate_targets(self, y): 715 y_ = column_or_1d(y, warn=True) --> 716 check_classification_targets(y) 717 cls, y = np.unique(y_, return_inverse=True) 718 self.class_weight_ = compute_class_weight(self.class_weight, classes=cls, y=y_) ~\anaconda3\lib\site-packages\sklearn\utils\multiclass.py in check_classification_targets(y) 196 "multilabel-sequences", 197 ]: --> 198 raise ValueError("Unknown label type: %r" % y_type) 199 200 ValueError: Unknown label type: 'continuous'
def plot_preds_on_true_vals(y_true, y_preds, x, title, kind='line'):
    """Visualize predictions against ground truth.

    kind='line' draws both series over `x` (e.g. timestamps) with a legend;
    any other value draws a true-vs-predicted scatter plot.
    """
    plt.figure(figsize=(14, 12))
    if kind != 'line':
        sns.scatterplot(x=y_true, y=y_preds)
    else:
        axes = plt.axes()
        sns.lineplot(x=x, y=y_true)
        sns.lineplot(x=x, y=y_preds)
        axes.legend(['true', 'predicted'])
    plt.title(title, fontsize=20, pad=30)
    plt.show()
# Sanity check: 22191 training samples (2011-2013).
y_train.values.shape
(22191,)
# Naive baseline: predict the overall mean CO for every test row.
# Fixed: the local was named `mean`, shadowing numpy's `mean` imported above
# (`from numpy import mean`), which other cells still call as a function.
co_mean = df['CO'].mean()
mean_preds = y_test.shape[0] * [co_mean]
mae = mean_absolute_error(y_true=y_test, y_pred=mean_preds)
rmse = mean_squared_error(y_true=y_test, y_pred=mean_preds, squared=False)
mape = mape_func(y_test, mean_preds)
print(f'{rmse = }, {mape = }, {mae = }')
print('r2_score:', r2_score(y_test, mean_preds))
rmse = 2.2026093836918292, mape = 95.26849435368567, mae = 1.2639184662715204 r2_score: -0.012139891766259625
# Fixed: `dates` was referenced one line before it was assigned — fine in
# the original notebook session, but a NameError when run top-to-bottom.
dates = df[df['Datetime'].dt.year.isin([2014, 2015])]['Datetime']
# NOTE(review): the first plot shows the mean baseline, yet its title says
# XGBRegressor — confirm the intended title.
plot_preds_on_true_vals(y_test, mean_preds, dates, 'CO predictions with XGBRegressor', 'scatter')
plot_preds_on_true_vals(y_test, preds, dates, 'CO predictions with XGBRegressor', 'scatter')
plot_preds_on_true_vals(y_test, preds, dates, 'CO predictions with XGBRegressor')
# Gradient-boosting model for CO on the same split (scaled features).
g_B_regr, rmse, mape, mae, preds = train_evaluate_model(GradientBoostingRegressor, X_train, y_train, X_test, y_test, scaler=StandardScaler,
                                                        params={'random_state':1})
print(f'{rmse = }, {mape = }, {mae = }')
print('r2_score:', r2_score(y_test, preds))
plot_preds_on_true_vals(y_test, preds, dates, 'CO predictions with GBRegressor', 'scatter')
plot_preds_on_true_vals(y_test, preds, dates, 'CO predictions with GBRegressor')
# NOx task: same chronological split, using AT and TIT as features
# (the two strongest NOx correlates in the correlation table above).
train_data = df[df['Datetime'].dt.year.isin([2011, 2012, 2013])]
cols = ['AT', 'TIT']
X_train, y_train = train_data[cols], train_data['NOX']
test_data = df[df['Datetime'].dt.year.isin([2014, 2015])]
X_test, y_test = test_data[cols], test_data['NOX']
nox_regr, rmse, mape, mae, preds = train_evaluate_model(GradientBoostingRegressor, X_train, y_train,
                                                        X_test, y_test, scaler=StandardScaler,
                                                        params={'random_state':0})
print(f'{rmse = }, {mape = }, {mae = }')
print('r2_score:', r2_score(y_test, preds))
rmse = 12.792864835206874, mape = 19.070318509881613, mae = 11.125044614716943 r2_score: -0.4629726823867981
# Scatter of NOx predictions vs. measured values.
plot_preds_on_true_vals(y_test, preds, dates, 'NOx predictions with GBRegressor', 'scatter')
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-27-875aabebd0fb> in <module> ----> 1 plot_preds_on_true_vals(y_test, preds, dates, 'NOx predictions with GBRegressor', 'scatter') NameError: name 'plot_preds_on_true_vals' is not defined
# Time-series overlay of NOx predictions vs. measured values.
plot_preds_on_true_vals(y_test, preds, dates, 'NOx predictions with GBRegressor')
# XGBoost model for NOx (eval_metric MAPE).
# NOTE(review): learning_rate and eta are aliases in xgboost — both are set
# (1 and 0.3), which is ambiguous; confirm which rate is intended.
noxXGB_regr, rmse, mape, mae, preds = train_evaluate_model(XGBRegressor, X_train, y_train, X_test, y_test,
                                                           scaler=StandardScaler,
                                                           params=dict(
                                                               learning_rate = 1,
                                                               max_depth=8,
                                                               n_estimators=1000,
                                                               min_child_weight=300,
                                                               colsample_bytree=0.8,
                                                               subsample=0.8,
                                                               eta=0.3,
                                                               seed=42,
                                                               eval_metric="mape"
                                                           ))
print(f'{rmse = } {mape = } {mae = }')
print('r2_score', r2_score(y_test, preds))
rmse = 12.82291449399772 mape = 18.649174381876243 mae = 10.63866234095981 r2_score -0.46985362146038967
# Multi-metric cross-validation of a default GB regressor on the NOx test split.
model = GradientBoostingRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_validate(model, X_test, y_test, scoring=['r2','neg_mean_absolute_error','neg_mean_absolute_percentage_error'], cv=cv, n_jobs=-1, error_score='raise')
n_scores
{'fit_time': array([1.48960042, 1.46246743, 1.50077581, 1.49060059, 1.48681617,
1.4695425 , 1.48202181, 1.50899053, 2.09743333, 2.34062672,
2.17504573, 1.98965001, 2.37942314, 2.01911044, 2.30329442,
2.02357197, 2.01070857, 2.0472343 , 2.05860591, 1.9180584 ,
2.10779643, 2.29371142, 2.0540247 , 1.99327397, 1.73602939,
1.68089724, 1.65935397, 1.65661478, 1.62016201, 1.56364346]),
'score_time': array([0.008178 , 0.00895286, 0.00882936, 0.00815368, 0.00841045,
0.00768709, 0.00958037, 0.00856948, 0.00902081, 0.02429485,
0.01088047, 0.00905323, 0.01402307, 0.00899792, 0.01165986,
0.0097816 , 0.0275178 , 0.02440095, 0.00801897, 0.0082159 ,
0.00793266, 0.01044273, 0.00822973, 0.00903535, 0.00680232,
0.00804186, 0.00890684, 0.00598812, 0.00608063, 0.00503492]),
'test_r2': array([0.69309004, 0.66510799, 0.71764882, 0.65997332, 0.68039428,
0.722383 , 0.72664843, 0.6989538 , 0.69933975, 0.68231829,
0.70099106, 0.67579413, 0.67839188, 0.69923823, 0.68676743,
0.6793055 , 0.69332942, 0.68447187, 0.70147449, 0.7369884 ,
0.69389837, 0.69893958, 0.72593246, 0.69827219, 0.66208984,
0.69514624, 0.67892292, 0.66189445, 0.70061672, 0.7120314 ]),
'test_neg_mean_absolute_error': array([-4.14140869, -4.1632491 , -4.00470272, -4.28716459, -3.95487528,
-4.05632226, -3.86910955, -3.98723727, -4.23408949, -4.12058301,
-4.01668849, -4.13298934, -3.99196546, -3.90171691, -4.16690185,
-4.12747196, -4.16696663, -4.16694474, -3.99913596, -4.18220962,
-4.08631524, -4.14466486, -4.12092102, -4.11498114, -4.20354715,
-4.05634899, -3.9285421 , -4.26002785, -4.01027029, -3.99651283]),
'test_neg_mean_absolute_percentage_error': array([-0.06947731, -0.06894909, -0.06674913, -0.07185383, -0.06737664,
-0.06692146, -0.06414317, -0.06668627, -0.06994729, -0.06855203,
-0.06677289, -0.06884999, -0.06722674, -0.06408676, -0.0703293 ,
-0.06810225, -0.06947282, -0.06927785, -0.067058 , -0.06979385,
-0.0682248 , -0.06721178, -0.0676404 , -0.06948572, -0.07040824,
-0.06765006, -0.06562673, -0.07186061, -0.06756527, -0.06649682])}
# Average out-of-fold R^2 for the NOx cross-validation above.
n_scores['test_r2'].mean()
0.693686274134205
# Average out-of-fold MAE (sign flipped back to a positive error).
n_scores['test_neg_mean_absolute_error'].mean()*(-1)
4.086398323314884
from sklearn.model_selection import cross_val_predict
# Out-of-fold NOx predictions on the test split.
preds = cross_val_predict(model, X_test, y_test, cv=10)
# Scatter plots of test set predictions
# NOTE(review): the x-axis label says XGBRegressor, but `model` here is a
# GradientBoostingRegressor — confirm the intended label.
plt.figure(figsize=(14, 12))
ax = plt.axes()
plt.scatter(preds, y_test)
plt.plot(np.unique(preds), np.poly1d(np.polyfit(preds,y_test ,1))(np.unique(preds)), color='red')
plt.title('Scatter plots of test set predictions', fontsize = 20, pad = 30)
plt.xlabel('XGBRegressor Prediction - NOx', fontsize=16)
plt.ylabel('Sensor Measurement - NOx', fontsize=18)
plt.show()
# Scatter plots of test set predictions
# Same scatter as above, now annotated with the R^2 value.
# NOTE(review): axis label says XGBRegressor, but `preds` come from the
# GradientBoostingRegressor via cross_val_predict — confirm the label.
plt.figure(figsize=(14, 12))
ax = plt.axes()
plt.scatter(preds, y_test)
plt.plot(np.unique(preds), np.poly1d(np.polyfit(preds,y_test ,1))(np.unique(preds)), color='red')
plt.text(100, 40, 'R-squared = %0.2f' % r2_score(y_test, preds))
plt.title('Scatter plots of test set predictions', fontsize = 20, pad = 30)
plt.xlabel('XGBRegressor Prediction - NOx', fontsize=16)
plt.ylabel('Sensor Measurement - NOx', fontsize=18)
plt.show()
# Fixed argument order: the helper's signature is (y_true, y_preds, ...);
# the scatter call previously passed (preds, y_test), flipping the axes
# relative to every other call in this file.
plot_preds_on_true_vals(y_test, preds, dates, 'NOx predictions with XGBRegressor', 'scatter')
plot_preds_on_true_vals(y_test, preds, dates, 'NOx predictions with XGBRegressor')
from sklearn.neural_network import MLPRegressor
# Neural-network baseline for NOx on scaled features with early stopping.
noxMLX_regr, rmse, mape, mae, preds = train_evaluate_model(MLPRegressor, X_train, y_train, X_test, y_test,
                                                           scaler=StandardScaler,
                                                           params=dict(
                                                               random_state=1,
                                                               max_iter=500,
                                                               early_stopping=True
                                                           ))
print(f'{rmse = } {mape = } {mae = }')
print('r2_score', r2_score(y_test, preds))
plot_preds_on_true_vals(y_test, preds, dates, 'NOx predictions with MLPRegressor', 'scatter')
plot_preds_on_true_vals(y_test, preds, dates, 'NOx predictions with MLPRegressor')
# Train on 2011 only and test on 2012 (CO task).
train_data = df[df['Datetime'].dt.year.isin([2011])]
# (A dead cols=['AT','TIT'] assignment that was immediately overwritten
# has been removed.)
cols = ['GTEP','TIT','TEY','CDP']
X_train, y_train = train_data[cols], train_data['CO']
test_data = df[df['Datetime'].dt.year.isin([2012])]
X_test, y_test = test_data[cols], test_data['CO']
# Fixed: eval_set / verbose / early_stopping_rounds are fit() arguments in
# xgboost 1.5 and are ignored when passed to the constructor (see the
# earlier warning); dropped with no change to the trained model.
one_year_model, rmse, mape, mae, preds = train_evaluate_model(
    XGBRegressor, X_train, y_train, X_test, y_test,
    scaler=StandardScaler,
    params=dict(
        max_depth=8,
        n_estimators=1000,
        min_child_weight=300,
        colsample_bytree=0.8,
        subsample=0.8,
        eta=0.3,
        seed=42,
        eval_metric="rmse",
    )
)
# First 100 readings of ISO week 1 of 2012.
# Fixed: .iloc only accepts positional indexers — .iloc[:100, 'CO'] and
# .iloc[:100, cols] raise; select the columns by label first, then slice.
week_1_mask = (df['Datetime'].dt.isocalendar().week == 1) & (df['Datetime'].dt.year == 2012)
week_1_co = df.loc[week_1_mask, 'CO'].iloc[:100]
week_1_X = df.loc[week_1_mask, cols].iloc[:100]
# NOTE(review): the model was fitted on StandardScaler-transformed features
# inside train_evaluate_model, but week_1_X is raw — confirm and reuse the
# fitted scaler for a consistent prediction.
week_1_preds = one_year_model.predict(week_1_X)
# Fixed: truncate the timestamps to the same 100 points so the line plot's
# x and y series have equal length.
dates = df.loc[week_1_mask, 'Datetime'].iloc[:100]
plot_preds_on_true_vals(week_1_co, week_1_preds, dates, 'Predictive systems')
# predict the train/test data and calculate the standard uncertainty of the emission model
# NOTE(review): `reg` was never defined in this file — the NOx model is bound
# as `nox_regr`; using that here. Also note the model was fitted on
# StandardScaler-transformed features inside train_evaluate_model, so
# predicting on raw X_train/X_test is inconsistent — TODO confirm and apply
# the fitted scaler. This cell also assumes X_train/y_train still hold the
# NOx split — confirm cell execution order.
n = len(X_train)
df_train = X_train.join(y_train)
df_train['preds'] = nox_regr.predict(X_train)
df_train
# the deviation between the emission concentration measured in the field
# and the concentration predicted by the PEMS pair
df_train['diff'] = np.abs(df_train['preds'] - df_train['NOX'])
df_train
# the mean of the relative deviation
delta_m = (df_train['diff'].sum())/len(df_train)
delta_m
# calculate u_model of the train data (sample std with Bessel's correction)
df_train['deltas_diff'] = (df_train['diff'] - delta_m)**2
u_train = np.sqrt(df_train['deltas_diff'].sum()/(n-1))
u_train
# same computation for the test split
df_test = X_test.join(y_test)
df_test['preds'] = nox_regr.predict(X_test)
df_test['y_test'] = y_test
df_test['diff'] = np.abs(df_test['preds'] - df_test['y_test'])
delta_m = (df_test['diff'].sum())/len(df_test)
df_test['deltas_diff'] = (df_test['diff'] - delta_m)**2
u_test = np.sqrt(df_test['deltas_diff'].sum()/(len(df_test)-1))
u_test
# Fixed: the re-computation below had a misplaced parenthesis —
# len(df_test-1) equals len(df_test), silently dropping Bessel's correction.
u_test = np.sqrt(df_test['deltas_diff'].sum()/(len(df_test)-1))
u_test
# combined standard uncertainty of the PEMS model
u_pems = np.sqrt(u_train**2 + u_test**2)
u_pems
# measuring range of the training targets
c_pems = df_train['NOX'].max() - df_train['NOX'].min()
c_pems
# expanded relative uncertainty (95 % coverage, k = 1.96)
U_pems = 1.96*u_pems/c_pems
U_pems
import numpy as np
from scipy.stats import t
# 95% confidence interval for the mean NOx concentration (Student's t).
x = df[['NOX']]
m = x.mean()  # sample mean (one-element Series)
s = x.std()  # sample standard deviation
dof = len(x)-1  # degrees of freedom
confidence = 0.95
dof
# two-sided critical t value at the chosen confidence level
t_crit = np.abs(t.ppf((1-confidence)/2,dof))
t_crit
# (lower, upper) bounds of the confidence interval for the mean
(m-s*t_crit/np.sqrt(len(x)), m+s*t_crit/np.sqrt(len(x)))
# half-width (margin of error) of the interval
s*t_crit/np.sqrt(len(x))